This file contains an example of tuning an XGBoost model with BayesSearchCV.
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio
pio.renderers.default = 'notebook'  # render plotly figures inline in the notebook


def _load_pickle(path):
    """Deserialize and return the object pickled at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training features/labels pickled by an upstream step.
# NOTE(review): pickle.load is only safe on trusted files.
X_train = _load_pickle('../X_train.pkl')
y_train = _load_pickle('../y_train.pkl')
# Summary statistics for the numeric columns (return_style=True gives a styled table).
hlp.pandas.numeric_summary(X_train, return_style=True)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Summary of the non-numeric (categorical) columns: null rates, mode, cardinality.
hlp.pandas.non_numeric_summary(X_train, return_style=True)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# Peek at the first ten training labels.
y_train[:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Class balance: (unique labels, count per label).
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class balance as proportions. Compute np.unique once rather than twice,
# as the original expression did redundantly.
_, class_counts = np.unique(y_train, return_counts=True)
class_counts / np.sum(class_counts)
array([0.69875, 0.30125])
from sklearn.preprocessing import OrdinalEncoder
# Demonstration: ordinal-encode two categorical columns (each category -> a float code).
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])
array([[0., 2.],
[2., 2.],
[9., 1.],
...,
[9., 3.],
[6., 4.],
[6., 2.]])
# Split the feature names by dtype; these lists drive the ColumnTransformer below.
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns, non_numeric_columns, sep='\n')
['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents'] ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
# Numeric preprocessing: both steps are TransformerChooser placeholders so the
# hyperparameter search can decide whether to impute and which scaler (if any)
# to apply. Run outside of tuning, the placeholders are pass-throughs.
numeric_pipeline = Pipeline(steps=[
    ('imputer', hlp.sklearn_pipeline.TransformerChooser()),
    ('scaler', hlp.sklearn_pipeline.TransformerChooser()),
])

# Non-numeric preprocessing: the categorical encoder is likewise selected
# during the search.
non_numeric_pipeline = Pipeline(steps=[
    ('encoder', hlp.sklearn_pipeline.TransformerChooser()),
])
from sklearn.compose import ColumnTransformer

# Route each column group through its matching preprocessing pipeline.
transformations_pipeline = ColumnTransformer(transformers=[
    ('numeric', numeric_pipeline, numeric_columns),
    ('non_numeric', non_numeric_pipeline, non_numeric_columns),
])
Note: on Apple Silicon (ARM), importing XGBoost can fail with `XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded`.
See https://github.com/dmlc/xgboost/issues/6909 — the workaround is to build from source:
pip install --upgrade --force-reinstall xgboost --no-binary xgboost -v
from xgboost import XGBClassifier

# Base XGBoost classifier. eval_metric is set explicitly (together with
# use_label_encoder=False) to silence the deprecation warning in xgboost >= 1.3.
xgb_model = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)

# End-to-end pipeline: preprocessing followed by the model.
# (A KernelPCA step was tried here previously and removed.)
full_pipeline = Pipeline(steps=[
    ('prep', transformations_pipeline),
    ('model', xgb_model),
])
# Inspect the pipeline's top-level steps (preprocessing ColumnTransformer + model).
full_pipeline.named_steps
{'prep': ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser()),
('scaler',
TransformerChooser())]),
['duration', 'credit_amount',
'installment_commitment', 'residence_since',
'age', 'existing_credits',
'num_dependents']),
('non_numeric',
Pipeline(steps=[('encoder',
TransformerChooser())]),
['checking_status', 'credit_history',
'purpose', 'savings_status', 'employment',
'personal_status', 'other_parties',
'property_magnitude', 'other_payment_plans',
'housing', 'job', 'own_telephone',
'foreign_worker'])]),
'model': XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
colsample_bynode=None, colsample_bytree=None,
enable_categorical=False, eval_metric='logloss', gamma=None,
gpu_id=None, importance_type=None, interaction_constraints=None,
learning_rate=None, max_delta_step=None, max_depth=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, reg_alpha=None, reg_lambda=None,
scale_pos_weight=None, subsample=None, tree_method=None,
use_label_encoder=False, validate_parameters=None,
verbosity=None)}
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
from sklearn.metrics import SCORERS
# Named scorers for evaluation. Only 'roc_auc' is passed to the search below;
# the rest are kept for scoring the fitted model.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve
scores = {'ROC/AUC': SCORERS['roc_auc']}
scores.update({
    name: make_scorer(metric, greater_is_better=True)
    for name, metric in (
        ('F1', f1_score),
        ('Pos. Pred. Val', precision_score),
        ('True Pos. Rate', recall_score),
    )
})
# Cross-validation configuration: 5-fold CV repeated twice -> 10 fits per candidate.
num_folds, num_repeats = 5, 2
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold
https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663
max_depth: 3–10 n_estimators: 100 (lots of observations) to 1000 (few observations) learning_rate: 0.01–0.3 colsample_bytree: 0.5–1 subsample: 0.6–1
Then, you can focus on optimizing max_depth and n_estimators. After that, experiment with the learning_rate: increase it to speed up training without degrading performance. If training becomes faster with no loss in performance, increase the number of estimators to try to improve the results further.
# Hyperparameter search space. Preprocessing choices are searched jointly with
# the XGBoost hyperparameters; ranges follow common XGBoost tuning guidance.
# (A PCA step was previously part of this space and has been removed.)
search_space = {
    # preprocessing: impute strategy, optional scaling, categorical encoding
    'prep__numeric__imputer__transformer': Categorical([SimpleImputer(strategy='mean')]),
    'prep__numeric__scaler__transformer': Categorical([
        None,
        MinMaxScaler(),
        StandardScaler(),
    ]),
    'prep__non_numeric__encoder__transformer': Categorical([
        OneHotEncoder(),
        hlp.sklearn_pipeline.CustomOrdinalEncoder(),
    ]),
    # model hyperparameters
    'model__max_depth': Integer(3, 10),
    'model__n_estimators': Integer(50, 2000),
    'model__learning_rate': Real(0.01, 0.3),
    'model__colsample_bytree': Real(0.01, 1),
    'model__subsample': Real(0.1, 1),
}
# Bayesian hyperparameter search over the space above: 50 candidates, scored by
# ROC/AUC with repeated k-fold CV, fitting folds in parallel across all cores.
bayes_search = BayesSearchCV(
    estimator=full_pipeline,
    search_spaces=search_space,
    random_state=42,
    n_iter=50,
    cv=RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time
# Drop the module-level reference; presumably no longer needed after fitting —
# NOTE(review): confirm nothing downstream reads search_space.
del search_space
Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, 
totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits
# Report total BayesSearchCV runtime in seconds and minutes.
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 139.866 seconds; 2.3 minutes
# Raw cross-validation results for every candidate evaluated during the search.
print(bayes_search.cv_results_)
{'mean_fit_time': array([0.91540077, 2.97680652, 0.61755981, 3.1776381 , 1.42760015,
0.6581629 , 0.98548501, 1.42064655, 1.42922101, 0.14844317,
2.76263421, 0.04913921, 0.02552745, 0.32107105, 0.22884459,
2.78207996, 0.15752764, 0.63771639, 0.03724518, 0.20636985,
0.85205717, 0.08833435, 0.06047475, 0.03386319, 0.63402202,
0.04324572, 0.04619427, 0.82597058, 1.41828265, 0.37768147,
0.34136021, 0.0334723 , 0.44540069, 0.09616776, 0.30100787,
0.03956122, 0.99580045, 0.10397882, 0.5058147 , 1.00126624,
0.31424382, 0.29258344, 1.07777073, 3.56608882, 0.36573451,
0.90820949, 6.77010171, 0.0265013 , 0.39557285, 0.28505092]), 'std_fit_time': array([0.1418178 , 0.52420301, 0.08513566, 0.46460976, 0.26510655,
0.10411318, 0.13303593, 0.18725435, 0.17496685, 0.01046407,
0.39407714, 0.00721641, 0.00677767, 0.03681997, 0.03009434,
0.3758923 , 0.0242508 , 0.10039483, 0.00868249, 0.02981395,
0.11994175, 0.01536929, 0.01006379, 0.01105708, 0.09024344,
0.01361256, 0.01156157, 0.0966723 , 0.21079415, 0.0639985 ,
0.04511175, 0.00894199, 0.06563285, 0.01669314, 0.02883129,
0.00982728, 0.14194503, 0.01903422, 0.06297069, 0.14402215,
0.04138179, 0.03767253, 0.14912455, 0.5173747 , 0.04607639,
0.13635969, 1.01152921, 0.00604143, 0.06454258, 0.04068234]), 'mean_score_time': array([0.01459444, 0.01303897, 0.00980735, 0.02244353, 0.02817004,
0.01091731, 0.02072327, 0.01231446, 0.01606815, 0.01152391,
0.01802149, 0.00831888, 0.009288 , 0.03209736, 0.03272071,
0.04756904, 0.02388415, 0.04291885, 0.01680632, 0.02019486,
0.02799222, 0.01000443, 0.020525 , 0.00854397, 0.02616282,
0.01970212, 0.00748739, 0.03466244, 0.01755385, 0.01306658,
0.01923714, 0.0101151 , 0.01026218, 0.0089638 , 0.00962086,
0.01666729, 0.03422339, 0.01439393, 0.02956877, 0.01784339,
0.02078269, 0.0156868 , 0.01640246, 0.01901066, 0.01542134,
0.01825612, 0.06605358, 0.00939789, 0.00850267, 0.02118921]), 'std_score_time': array([0.00414726, 0.00349136, 0.00271789, 0.00638556, 0.00770665,
0.00476504, 0.00545635, 0.00367979, 0.00866763, 0.00483114,
0.00404481, 0.00342618, 0.00341193, 0.01765097, 0.00952663,
0.01342095, 0.01075763, 0.02105058, 0.00409524, 0.00603711,
0.00457716, 0.00372531, 0.00605053, 0.00268551, 0.00833575,
0.00597634, 0.00167056, 0.01976417, 0.00772596, 0.00479096,
0.0062484 , 0.0049319 , 0.00468153, 0.00247406, 0.00439606,
0.00474385, 0.00877549, 0.01044416, 0.00925831, 0.00702363,
0.00977621, 0.00844375, 0.00592444, 0.00687851, 0.00652977,
0.01187972, 0.049867 , 0.00429565, 0.00288394, 0.00593148]), 'param_model__colsample_bytree': masked_array(data=[0.4160029192647807, 0.8390144719977516,
0.4503841871781403, 0.8142720284737898,
0.8015579071911014, 0.7366877378057127,
0.6209085649172932, 0.5479690370134094,
0.955923206446829, 0.013594004182195795, 1.0, 1.0,
0.01, 0.2765161505401894, 0.13185918812684402,
0.9063493444677292, 1.0, 0.01, 0.01, 0.01,
0.3269777094852202, 0.19261050437997468,
0.2634368894209554, 0.01, 0.17235665761252472,
0.1190241261491851, 0.10802360484262012,
0.18799751224554667, 0.43598085034798706,
0.011565210663852493, 0.39547614449804336,
0.05423850667826528, 0.01, 0.30511081127204787, 0.01,
0.06751384407951891, 0.387868802856545, 0.01,
0.036082306338768945, 0.5200165232955171, 1.0,
0.5936524097357171, 0.4564883967636975,
0.9977496236899784, 0.6863126755861281,
0.4714312428507383, 0.8211016922033352, 0.01, 0.01,
0.40032425080985073],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_model__learning_rate': masked_array(data=[0.22104046552142426, 0.26616143044045004,
0.27642953128411935, 0.059842752805002605,
0.13702846406786778, 0.2824172239048038,
0.23485566986008594, 0.27680267566682176,
0.21318021894019612, 0.24717070570248795, 0.01, 0.01,
0.01, 0.010924480728249177, 0.011905257050905997,
0.012747660567312864, 0.01, 0.01, 0.01, 0.01, 0.01,
0.059835082112581366, 0.01, 0.01, 0.01,
0.062050875825482285, 0.046847382834046145,
0.08418627367702325, 0.01, 0.09182754020407823,
0.0782708025863502, 0.032406068301243984,
0.05785344656357238, 0.03570963521716221,
0.13041356321370637, 0.025959858191703787,
0.04175256464911765, 0.057016006987802174,
0.2996550763351218, 0.01, 0.3, 0.010807160619847623,
0.01, 0.09822113379390658, 0.01028335197790437,
0.0685872815010209, 0.01, 0.01, 0.01, 0.01],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_model__max_depth': masked_array(data=[10, 5, 4, 7, 7, 4, 6, 6, 9, 8, 3, 3, 4, 10, 8, 10, 8,
9, 9, 10, 7, 8, 7, 9, 10, 8, 6, 6, 3, 3, 3, 7, 3, 7, 3,
7, 10, 9, 10, 10, 5, 8, 3, 7, 9, 3, 10, 3, 4, 10],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_model__n_estimators': masked_array(data=[666, 1905, 895, 1615, 1446, 417, 1755, 1683, 861, 546,
2000, 50, 50, 244, 249, 1439, 50, 2000, 50, 680, 864,
50, 50, 50, 2000, 50, 50, 917, 1742, 1250, 809, 50,
2000, 50, 1252, 50, 964, 327, 1736, 895, 246, 160,
1642, 1685, 276, 1546, 2000, 50, 1516, 270],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_model__subsample': masked_array(data=[0.7031331534420412, 0.8777151239184556,
0.2691596677306137, 0.5707470952489453,
0.913466865256018, 0.8204529083110704,
0.633357707775011, 0.3916539840559363,
0.44295828831774353, 0.6207979026055463, 1.0, 0.1, 1.0,
0.97682295768335, 0.9494804275407859,
0.9902507316010832, 1.0, 0.6096886064502337,
0.3720690567052698, 1.0, 1.0, 1.0, 0.8037538422634443,
1.0, 0.11922595565284902, 0.1, 1.0, 1.0,
0.8895892298826407, 0.18881318667953775,
0.8410824635124519, 1.0, 0.1, 1.0, 0.1, 1.0, 1.0,
0.15401417659462413, 0.7348272768200084,
0.2008060783709453, 0.72623139604702,
0.2943055685322117, 0.297742730146211,
0.7364360184651706, 0.16603223754272423,
0.16513689905232265, 0.8599569395956452, 0.1,
0.42243111335114936, 0.424738803519765],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_prep__non_numeric__encoder__transformer': masked_array(data=[OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_prep__numeric__imputer__transformer': masked_array(data=[SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'param_prep__numeric__scaler__transformer': masked_array(data=[MinMaxScaler(), MinMaxScaler(), StandardScaler(),
StandardScaler(), MinMaxScaler(), MinMaxScaler(),
MinMaxScaler(), None, MinMaxScaler(), None,
StandardScaler(), StandardScaler(), None, None,
StandardScaler(), StandardScaler(), StandardScaler(),
MinMaxScaler(), StandardScaler(), StandardScaler(),
StandardScaler(), MinMaxScaler(), StandardScaler(),
StandardScaler(), MinMaxScaler(), StandardScaler(),
StandardScaler(), MinMaxScaler(), StandardScaler(),
StandardScaler(), StandardScaler(), MinMaxScaler(),
MinMaxScaler(), MinMaxScaler(), MinMaxScaler(),
MinMaxScaler(), StandardScaler(), MinMaxScaler(),
StandardScaler(), MinMaxScaler(), StandardScaler(),
MinMaxScaler(), MinMaxScaler(), MinMaxScaler(), None,
MinMaxScaler(), StandardScaler(), StandardScaler(),
StandardScaler(), MinMaxScaler()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False],
fill_value='?',
dtype=object), 'params': [OrderedDict([('model__colsample_bytree', 0.4160029192647807), ('model__learning_rate', 0.22104046552142426), ('model__max_depth', 10), ('model__n_estimators', 666), ('model__subsample', 0.7031331534420412), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.8390144719977516), ('model__learning_rate', 0.26616143044045004), ('model__max_depth', 5), ('model__n_estimators', 1905), ('model__subsample', 0.8777151239184556), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.4503841871781403), ('model__learning_rate', 0.27642953128411935), ('model__max_depth', 4), ('model__n_estimators', 895), ('model__subsample', 0.2691596677306137), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.8142720284737898), ('model__learning_rate', 0.059842752805002605), ('model__max_depth', 7), ('model__n_estimators', 1615), ('model__subsample', 0.5707470952489453), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.8015579071911014), ('model__learning_rate', 0.13702846406786778), ('model__max_depth', 7), ('model__n_estimators', 1446), ('model__subsample', 0.913466865256018), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), 
OrderedDict([('model__colsample_bytree', 0.7366877378057127), ('model__learning_rate', 0.2824172239048038), ('model__max_depth', 4), ('model__n_estimators', 417), ('model__subsample', 0.8204529083110704), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.6209085649172932), ('model__learning_rate', 0.23485566986008594), ('model__max_depth', 6), ('model__n_estimators', 1755), ('model__subsample', 0.633357707775011), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.5479690370134094), ('model__learning_rate', 0.27680267566682176), ('model__max_depth', 6), ('model__n_estimators', 1683), ('model__subsample', 0.3916539840559363), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.955923206446829), ('model__learning_rate', 0.21318021894019612), ('model__max_depth', 9), ('model__n_estimators', 861), ('model__subsample', 0.44295828831774353), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.013594004182195795), ('model__learning_rate', 0.24717070570248795), ('model__max_depth', 8), ('model__n_estimators', 546), ('model__subsample', 0.6207979026055463), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.01), 
('model__max_depth', 3), ('model__n_estimators', 2000), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.01), ('model__max_depth', 3), ('model__n_estimators', 50), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 4), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.2765161505401894), ('model__learning_rate', 0.010924480728249177), ('model__max_depth', 10), ('model__n_estimators', 244), ('model__subsample', 0.97682295768335), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.13185918812684402), ('model__learning_rate', 0.011905257050905997), ('model__max_depth', 8), ('model__n_estimators', 249), ('model__subsample', 0.9494804275407859), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.9063493444677292), ('model__learning_rate', 0.012747660567312864), ('model__max_depth', 10), ('model__n_estimators', 1439), ('model__subsample', 0.9902507316010832), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), 
('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.01), ('model__max_depth', 8), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 9), ('model__n_estimators', 2000), ('model__subsample', 0.6096886064502337), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 9), ('model__n_estimators', 50), ('model__subsample', 0.3720690567052698), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 680), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.3269777094852202), ('model__learning_rate', 0.01), ('model__max_depth', 7), ('model__n_estimators', 864), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.19261050437997468), ('model__learning_rate', 
0.059835082112581366), ('model__max_depth', 8), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.2634368894209554), ('model__learning_rate', 0.01), ('model__max_depth', 7), ('model__n_estimators', 50), ('model__subsample', 0.8037538422634443), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 9), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.17235665761252472), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 2000), ('model__subsample', 0.11922595565284902), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.1190241261491851), ('model__learning_rate', 0.062050875825482285), ('model__max_depth', 8), ('model__n_estimators', 50), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.10802360484262012), ('model__learning_rate', 0.046847382834046145), ('model__max_depth', 6), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', 
OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.18799751224554667), ('model__learning_rate', 0.08418627367702325), ('model__max_depth', 6), ('model__n_estimators', 917), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.43598085034798706), ('model__learning_rate', 0.01), ('model__max_depth', 3), ('model__n_estimators', 1742), ('model__subsample', 0.8895892298826407), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.011565210663852493), ('model__learning_rate', 0.09182754020407823), ('model__max_depth', 3), ('model__n_estimators', 1250), ('model__subsample', 0.18881318667953775), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.39547614449804336), ('model__learning_rate', 0.0782708025863502), ('model__max_depth', 3), ('model__n_estimators', 809), ('model__subsample', 0.8410824635124519), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.05423850667826528), ('model__learning_rate', 0.032406068301243984), ('model__max_depth', 7), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), 
('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.05785344656357238), ('model__max_depth', 3), ('model__n_estimators', 2000), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.30511081127204787), ('model__learning_rate', 0.03570963521716221), ('model__max_depth', 7), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.13041356321370637), ('model__max_depth', 3), ('model__n_estimators', 1252), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.06751384407951891), ('model__learning_rate', 0.025959858191703787), ('model__max_depth', 7), ('model__n_estimators', 50), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.387868802856545), ('model__learning_rate', 0.04175256464911765), ('model__max_depth', 10), ('model__n_estimators', 964), ('model__subsample', 1.0), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.057016006987802174), 
('model__max_depth', 9), ('model__n_estimators', 327), ('model__subsample', 0.15401417659462413), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.036082306338768945), ('model__learning_rate', 0.2996550763351218), ('model__max_depth', 10), ('model__n_estimators', 1736), ('model__subsample', 0.7348272768200084), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.5200165232955171), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 895), ('model__subsample', 0.2008060783709453), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 1.0), ('model__learning_rate', 0.3), ('model__max_depth', 5), ('model__n_estimators', 246), ('model__subsample', 0.72623139604702), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.5936524097357171), ('model__learning_rate', 0.010807160619847623), ('model__max_depth', 8), ('model__n_estimators', 160), ('model__subsample', 0.2943055685322117), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.4564883967636975), ('model__learning_rate', 0.01), ('model__max_depth', 3), ('model__n_estimators', 1642), ('model__subsample', 0.297742730146211), 
('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.9977496236899784), ('model__learning_rate', 0.09822113379390658), ('model__max_depth', 7), ('model__n_estimators', 1685), ('model__subsample', 0.7364360184651706), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.6863126755861281), ('model__learning_rate', 0.01028335197790437), ('model__max_depth', 9), ('model__n_estimators', 276), ('model__subsample', 0.16603223754272423), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__colsample_bytree', 0.4714312428507383), ('model__learning_rate', 0.0685872815010209), ('model__max_depth', 3), ('model__n_estimators', 1546), ('model__subsample', 0.16513689905232265), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__colsample_bytree', 0.8211016922033352), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 2000), ('model__subsample', 0.8599569395956452), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 3), ('model__n_estimators', 50), ('model__subsample', 0.1), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), 
('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.01), ('model__learning_rate', 0.01), ('model__max_depth', 4), ('model__n_estimators', 1516), ('model__subsample', 0.42243111335114936), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__colsample_bytree', 0.40032425080985073), ('model__learning_rate', 0.01), ('model__max_depth', 10), ('model__n_estimators', 270), ('model__subsample', 0.424738803519765), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())])], 'split0_test_score': array([0.74821256, 0.75894945, 0.76542544, 0.74919977, 0.69986264,
0.67149758, 0.72072727, 0.66905188, 0.65616097, 0.69807195,
0.71079308, 0.72568487, 0.84302083, 0.70520581, 0.78761755,
0.72444444, 0.68496047, 0.76463943, 0.78146453, 0.79691076,
0.75208333, 0.7312974 , 0.80236364, 0.68821256, 0.79038282,
0.70003918, 0.79151763, 0.74701232, 0.80583991, 0.84007915,
0.67243304, 0.81012914, 0.74947589, 0.74181548, 0.73320619,
0.762907 , 0.76371083, 0.74716756, 0.64803386, 0.75296995,
0.73208249, 0.83581541, 0.72829976, 0.70821842, 0.80536068,
0.71540881, 0.6824784 , 0.60792425, 0.71937399, 0.82127273]), 'split1_test_score': array([0.65191324, 0.72279187, 0.6850524 , 0.72135266, 0.78146453,
0.68519481, 0.71728056, 0.65109091, 0.71961758, 0.69747565,
0.71925227, 0.69709091, 0.69329016, 0.78083223, 0.72545455,
0.78078692, 0.71763393, 0.6462624 , 0.69984744, 0.72254545,
0.73620787, 0.74157524, 0.7586142 , 0.73365617, 0.7427836 ,
0.69120693, 0.72374608, 0.74677003, 0.75455047, 0.71884058,
0.69509044, 0.76675325, 0.72075727, 0.76432292, 0.75324675,
0.74360783, 0.72463768, 0.65649797, 0.61259037, 0.72842262,
0.71650525, 0.69868847, 0.80528797, 0.72069264, 0.76328502,
0.67802236, 0.71844156, 0.62116362, 0.7468378 , 0.74942691]), 'split2_test_score': array([0.66229167, 0.70272727, 0.656 , 0.73796509, 0.75372689,
0.77312006, 0.7 , 0.71492165, 0.71521577, 0.76183333,
0.79325927, 0.75942029, 0.75698812, 0.75037202, 0.74761648,
0.75254702, 0.70245455, 0.77012108, 0.70690909, 0.74607653,
0.7440706 , 0.76934524, 0.73758454, 0.69405749, 0.70031256,
0.69442912, 0.81194196, 0.71436364, 0.7137746 , 0.72388218,
0.74255952, 0.79898119, 0.72622858, 0.766875 , 0.78632479,
0.76861285, 0.73457676, 0.76765204, 0.69271324, 0.73789346,
0.74510188, 0.74535916, 0.73753561, 0.6772138 , 0.72544643,
0.80286521, 0.7705721 , 0.61565599, 0.76201373, 0.7421875 ]), 'split3_test_score': array([0.69679654, 0.66828087, 0.72076822, 0.73211987, 0.67518956,
0.66531401, 0.6647619 , 0.70608172, 0.71551547, 0.7710114 ,
0.72115385, 0.77716492, 0.79186904, 0.75925926, 0.75458937,
0.70840079, 0.70355426, 0.74144345, 0.75620518, 0.75527772,
0.6592296 , 0.78470448, 0.75733753, 0.7661788 , 0.80140313,
0.76883117, 0.79427083, 0.66372774, 0.71671498, 0.71875 ,
0.74096279, 0.72484277, 0.80779169, 0.73116295, 0.7316064 ,
0.69963636, 0.65014989, 0.76097179, 0.71695019, 0.78983516,
0.75669643, 0.72033898, 0.77777778, 0.75334821, 0.75267094,
0.71569814, 0.72848804, 0.63650913, 0.79443173, 0.76326677]), 'split4_test_score': array([0.71223958, 0.73978535, 0.74216301, 0.73696102, 0.66070364,
0.73620787, 0.74656981, 0.62437948, 0.74509804, 0.65275293,
0.79400966, 0.77790179, 0.6894686 , 0.80472727, 0.79204893,
0.72903605, 0.72195858, 0.75297619, 0.76236364, 0.71991365,
0.799463 , 0.7862925 , 0.70570048, 0.76078869, 0.70165652,
0.77212052, 0.70865987, 0.77459954, 0.76270531, 0.78718729,
0.70307042, 0.73398169, 0.72409409, 0.76601831, 0.70865987,
0.78393957, 0.77017834, 0.79227813, 0.68273395, 0.78930109,
0.68413311, 0.8143551 , 0.73008242, 0.71207265, 0.75130435,
0.69773855, 0.74103738, 0.6014067 , 0.79854545, 0.73871583]), 'split5_test_score': array([0.74006116, 0.68640693, 0.69233666, 0.74568966, 0.70493506,
0.73971014, 0.69855072, 0.684651 , 0.65659341, 0.74935401,
0.76878177, 0.71022406, 0.76454545, 0.75890313, 0.72883117,
0.72940503, 0.78248588, 0.74054545, 0.75103022, 0.74745455,
0.78950605, 0.78384687, 0.73444643, 0.69685558, 0.76816911,
0.71397733, 0.73947417, 0.69645833, 0.73493304, 0.74739583,
0.70509091, 0.81232194, 0.70964487, 0.76872727, 0.61446572,
0.75416667, 0.76060268, 0.79187351, 0.63302907, 0.73488985,
0.74113548, 0.78274697, 0.81083143, 0.70750135, 0.76030879,
0.77219532, 0.75 , 0.68280193, 0.81499288, 0.74024105]), 'split6_test_score': array([0.7662037 , 0.72304258, 0.75071225, 0.75787012, 0.70435742,
0.67030691, 0.72355016, 0.76985507, 0.76164778, 0.68640693,
0.68910256, 0.73772321, 0.73225999, 0.80970982, 0.78666667,
0.74572386, 0.66318182, 0.76257862, 0.76741041, 0.7373376 ,
0.7699115 , 0.78819636, 0.79099924, 0.75035868, 0.84521739,
0.75060533, 0.82030319, 0.76218744, 0.77584739, 0.742 ,
0.7328869 , 0.73248759, 0.71565934, 0.75733753, 0.72901533,
0.75391257, 0.73340321, 0.69898672, 0.6364009 , 0.7761251 ,
0.66241546, 0.74020376, 0.79363585, 0.7202381 , 0.78539306,
0.74617737, 0.81994048, 0.68715659, 0.65657789, 0.73958333]), 'split7_test_score': array([0.71454545, 0.76056946, 0.74111257, 0.75786164, 0.79805492,
0.78050595, 0.66475973, 0.72490909, 0.72945276, 0.71477008,
0.84071317, 0.74237351, 0.73466505, 0.74925057, 0.79702504,
0.74460742, 0.69454938, 0.70023511, 0.76135266, 0.756304 ,
0.75766747, 0.76468345, 0.8137666 , 0.71534821, 0.6802862 ,
0.76264881, 0.7251333 , 0.71505666, 0.75139762, 0.74001193,
0.70813691, 0.81311976, 0.78236607, 0.71476844, 0.76983219,
0.75783699, 0.6736961 , 0.70226648, 0.66181595, 0.78063834,
0.84261501, 0.77835749, 0.66917718, 0.76982707, 0.72289404,
0.69402597, 0.70758929, 0.64409066, 0.75984889, 0.74330357]), 'split8_test_score': array([0.73126785, 0.68824405, 0.68806951, 0.68119451, 0.73846726,
0.71745455, 0.70192308, 0.71614583, 0.75108266, 0.69652619,
0.76041667, 0.76531493, 0.7043533 , 0.72311087, 0.75293887,
0.74072802, 0.71073718, 0.77584541, 0.69866497, 0.74107143,
0.72233073, 0.75589135, 0.72965772, 0.73532005, 0.75858967,
0.70530627, 0.79982421, 0.73916172, 0.75560036, 0.75446429,
0.75418275, 0.70137207, 0.84639498, 0.83797417, 0.70560936,
0.74763636, 0.76088253, 0.69067797, 0.61710165, 0.75213675,
0.71942697, 0.78863324, 0.74394587, 0.70703125, 0.74404762,
0.64833333, 0.72881044, 0.63783307, 0.80180365, 0.81254368]), 'split9_test_score': array([0.70660191, 0.66810345, 0.67247387, 0.70254545, 0.74657126,
0.75283324, 0.70836364, 0.67944317, 0.68802441, 0.68789351,
0.75981818, 0.75408333, 0.76883117, 0.76605213, 0.82337662,
0.66294643, 0.61956522, 0.76283482, 0.75889485, 0.73623534,
0.76407456, 0.70707071, 0.70975984, 0.68714689, 0.75195925,
0.81212798, 0.7315683 , 0.77585565, 0.73245614, 0.86111111,
0.73796509, 0.71599003, 0.73556395, 0.71919516, 0.74739583,
0.72269918, 0.715205 , 0.77083333, 0.69487923, 0.74750712,
0.6855177 , 0.76300476, 0.75436661, 0.7316064 , 0.78773585,
0.68224299, 0.67400419, 0.60347826, 0.78822511, 0.77008929]), 'mean_test_score': array([0.71301337, 0.71189013, 0.71141139, 0.73227598, 0.72633332,
0.71921451, 0.70464869, 0.69405298, 0.71384088, 0.7116096 ,
0.75573005, 0.74469818, 0.74792917, 0.76074231, 0.76961653,
0.7318626 , 0.70010813, 0.7417482 , 0.7444143 , 0.7459127 ,
0.74945447, 0.76129036, 0.75402302, 0.72279231, 0.75407603,
0.73712926, 0.76464395, 0.73351931, 0.75038198, 0.76337224,
0.71923788, 0.76099794, 0.75179767, 0.75681972, 0.72793624,
0.74949554, 0.7287043 , 0.73792055, 0.65962484, 0.75897194,
0.72856298, 0.76675033, 0.75509405, 0.72077499, 0.75984468,
0.71527081, 0.73213619, 0.63380202, 0.76426511, 0.76206307]), 'std_test_score': array([0.03427238, 0.03273707, 0.03545065, 0.02333004, 0.04249131,
0.04151635, 0.02407248, 0.0392344 , 0.03493135, 0.03571821,
0.04391995, 0.02599407, 0.04526762, 0.03083814, 0.03059578,
0.02936588, 0.03976445, 0.03786645, 0.02896369, 0.02052191,
0.03727233, 0.02596081, 0.0358425 , 0.02895023, 0.04826907,
0.03942538, 0.04033375, 0.03412255, 0.02637261, 0.04782755,
0.02476402, 0.0420374 , 0.04330698, 0.03322264, 0.04468985,
0.02254938, 0.03801385, 0.04483113, 0.03400624, 0.02192659,
0.04761786, 0.03983521, 0.04096882, 0.02465926, 0.02537111,
0.04425633, 0.04030502, 0.02916297, 0.04536709, 0.02924587]), 'rank_test_score': array([42, 43, 45, 29, 35, 39, 46, 48, 41, 44, 13, 23, 21, 9, 1, 31, 47,
25, 24, 22, 20, 7, 16, 36, 15, 27, 3, 28, 18, 5, 38, 8, 17, 12,
34, 19, 32, 26, 49, 11, 33, 2, 14, 37, 10, 40, 30, 50, 4, 6],
dtype=int32)}
# Best mean cross-validation score (ROC AUC) found by the Bayesian search.
print(bayes_search.best_score_)
0.7696165252454255
# Hyper-parameter combination (including preprocessing transformers) that produced the best score.
print(bayes_search.best_params_)
OrderedDict([('model__colsample_bytree', 0.13185918812684402), ('model__learning_rate', 0.011905257050905997), ('model__max_depth', 8), ('model__n_estimators', 249), ('model__subsample', 0.9494804275407859), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])
# Map the verbose pipeline parameter names from the search space to short,
# readable column labels for reporting. The model hyper-parameters all share
# the 'model__' prefix, so build those entries programmatically.
# (PCA-related mappings were not needed for this search.)
_model_hyperparams = [
    'max_depth',
    'n_estimators',
    'learning_rate',
    'colsample_bytree',
    'subsample',
]
new_param_column_names = {f'model__{name}': name for name in _model_hyperparams}
new_param_column_names.update({
    'prep__non_numeric__encoder__transformer': 'encoder',
    'prep__numeric__imputer__transformer': 'imputer',
    'prep__numeric__scaler__transformer': 'scaler',
})
# Wrap the BayesSearchCV results in an MLExperimentResults object, persist it
# to YAML, then reload from the YAML file to confirm the round-trip works.
yaml_file = 'Run 1 - XGBoost - BayesSearchCV.yaml'
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better=True,
    parameter_name_mappings=new_param_column_names,
)
results.to_yaml_file(yaml_file_name=yaml_file)
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name=yaml_file)
# Average fit time (seconds) per trial, averaged across CV folds.
results.fit_time_averages
array([0.91540077, 2.97680652, 0.61755981, 3.1776381 , 1.42760015,
0.6581629 , 0.98548501, 1.42064655, 1.42922101, 0.14844317,
2.76263421, 0.04913921, 0.02552745, 0.32107105, 0.22884459,
2.78207996, 0.15752764, 0.63771639, 0.03724518, 0.20636985,
0.85205717, 0.08833435, 0.06047475, 0.03386319, 0.63402202,
0.04324572, 0.04619427, 0.82597058, 1.41828265, 0.37768147,
0.34136021, 0.0334723 , 0.44540069, 0.09616776, 0.30100787,
0.03956122, 0.99580045, 0.10397882, 0.5058147 , 1.00126624,
0.31424382, 0.29258344, 1.07777073, 3.56608882, 0.36573451,
0.90820949, 6.77010171, 0.0265013 , 0.39557285, 0.28505092])
# Best mean CV score; should match bayes_search.best_score_ printed above.
results.best_primary_score
0.7696165252454255
# Best parameters with the short labels from new_param_column_names applied.
results.best_primary_score_params
{'colsample_bytree': 0.13185918812684402,
'learning_rate': 0.011905257050905997,
'max_depth': 8,
'n_estimators': 249,
'subsample': 0.9494804275407859,
'encoder': 'CustomOrdinalEncoder()',
'imputer': 'SimpleImputer()',
'scaler': 'StandardScaler()'}
# Top 20 trials, formatted with the mean score and its 95% confidence interval.
results.to_formatted_dataframe(num_rows=20)
| roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | colsample_bytree | learning_rate | max_depth | n_estimators | subsample | encoder | scaler |
|---|---|---|---|---|---|---|---|---|---|
| 0.770 | 0.748 | 0.792 | 0.132 | 0.012 | 8 | 249 | 0.949 | CustomOrdinalEncoder() | StandardScaler() |
| 0.767 | 0.738 | 0.795 | 0.594 | 0.011 | 8 | 160 | 0.294 | OneHotEncoder() | MinMaxScaler() |
| 0.765 | 0.736 | 0.793 | 0.108 | 0.047 | 6 | 50 | 1.000 | OneHotEncoder() | StandardScaler() |
| 0.764 | 0.732 | 0.797 | 0.010 | 0.010 | 4 | 1,516 | 0.422 | OneHotEncoder() | StandardScaler() |
| 0.763 | 0.729 | 0.798 | 0.012 | 0.092 | 3 | 1,250 | 0.189 | OneHotEncoder() | StandardScaler() |
| 0.762 | 0.741 | 0.783 | 0.400 | 0.010 | 10 | 270 | 0.425 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.761 | 0.743 | 0.780 | 0.193 | 0.060 | 8 | 50 | 1.000 | OneHotEncoder() | MinMaxScaler() |
| 0.761 | 0.731 | 0.791 | 0.054 | 0.032 | 7 | 50 | 1.000 | OneHotEncoder() | MinMaxScaler() |
| 0.761 | 0.739 | 0.783 | 0.277 | 0.011 | 10 | 244 | 0.977 | CustomOrdinalEncoder() | None |
| 0.760 | 0.742 | 0.778 | 0.686 | 0.010 | 9 | 276 | 0.166 | OneHotEncoder() | None |
| 0.759 | 0.743 | 0.775 | 0.520 | 0.010 | 10 | 895 | 0.201 | OneHotEncoder() | MinMaxScaler() |
| 0.757 | 0.733 | 0.781 | 0.305 | 0.036 | 7 | 50 | 1.000 | OneHotEncoder() | MinMaxScaler() |
| 0.756 | 0.724 | 0.787 | 1.000 | 0.010 | 3 | 2,000 | 1.000 | OneHotEncoder() | StandardScaler() |
| 0.755 | 0.726 | 0.784 | 0.456 | 0.010 | 3 | 1,642 | 0.298 | OneHotEncoder() | MinMaxScaler() |
| 0.754 | 0.720 | 0.789 | 0.172 | 0.010 | 10 | 2,000 | 0.119 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.754 | 0.728 | 0.780 | 0.263 | 0.010 | 7 | 50 | 0.804 | CustomOrdinalEncoder() | StandardScaler() |
| 0.752 | 0.721 | 0.783 | 0.010 | 0.058 | 3 | 2,000 | 0.100 | OneHotEncoder() | MinMaxScaler() |
| 0.750 | 0.732 | 0.769 | 0.436 | 0.010 | 3 | 1,742 | 0.890 | OneHotEncoder() | StandardScaler() |
| 0.749 | 0.733 | 0.766 | 0.068 | 0.026 | 7 | 50 | 1.000 | CustomOrdinalEncoder() | MinMaxScaler() |
| 0.749 | 0.723 | 0.776 | 0.327 | 0.010 | 7 | 864 | 1.000 | CustomOrdinalEncoder() | StandardScaler() |
# Gives the score rank for each trial index.
# e.g. array([4, 2, 1, 3]) means the 1st iteration (i.e. set of params)
# was the worst and the 3rd iteration was the best.
results.primary_score_trial_ranking
array([42, 43, 45, 29, 35, 39, 46, 48, 41, 44, 13, 23, 21, 9, 1, 31, 47,
25, 24, 22, 20, 7, 16, 36, 15, 27, 3, 28, 18, 5, 38, 8, 17, 12,
34, 19, 32, 26, 49, 11, 33, 2, 14, 37, 10, 40, 30, 50, 4, 6])
# Gives the trial indexes ordered from best to worst score.
# e.g. if results.primary_score_trial_ranking were array([4, 2, 1, 3]),
# this would return [2, 1, 3, 0] because index 2 (i.e. the 3rd iteration)
# was the best, so it comes first; index 0 (i.e. the 1st iteration) was
# the worst, so it comes last.
results.primary_score_best_indexes
array([14, 41, 26, 48, 29, 49, 21, 31, 13, 44, 39, 33, 10, 42, 24, 22, 32,
28, 35, 20, 12, 19, 11, 18, 17, 37, 25, 27, 3, 46, 15, 36, 40, 34,
4, 23, 43, 30, 5, 45, 8, 0, 1, 9, 2, 6, 16, 7, 38, 47])
# Score per trial over time; point size/color encode the given hyper-parameters.
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(size='learning_rate', color='max_depth').show()
results.plot_performance_across_trials(size='learning_rate', color='encoder').show()
# How each hyper-parameter's sampled values evolved as the search progressed.
results.plot_parameter_values_across_trials().show()
# Pairwise scatter of hyper-parameters vs score.
results.plot_scatter(height=1000, width=1000 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params(height=800)
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# Score vs a single parameter, with secondary parameters encoded as size/color.
results.plot_score_vs_parameter(
    parameter='learning_rate',
    size='colsample_bytree',
    color='scaler'
)
results.plot_parameter_vs_parameter(parameter_x='colsample_bytree',
                                    parameter_y='learning_rate',
                                    size='max_depth'
)
results.plot_parameter_vs_parameter(parameter_x='colsample_bytree',
                                    parameter_y='learning_rate',
                                    size='imputer')
roc_auc Mean
score_variable = results.primary_score_name + ' Mean'
# Reduce the results dataframe to just the mean-score column plus the
# hyper-parameter columns (their original order is preserved by drop()).
score_dataframe = results.to_dataframe()
columns_to_keep = [score_variable] + results.parameter_names
score_dataframe = score_dataframe.drop(
    columns=[col for col in score_dataframe.columns if col not in columns_to_keep])
score_dataframe.head()
| roc_auc Mean | colsample_bytree | learning_rate | max_depth | n_estimators | subsample | encoder | scaler | |
|---|---|---|---|---|---|---|---|---|
| 14 | 0.769617 | 0.131859 | 0.011905 | 8 | 249 | 0.949480 | CustomOrdinalEncoder() | StandardScaler() |
| 41 | 0.766750 | 0.593652 | 0.010807 | 8 | 160 | 0.294306 | OneHotEncoder() | MinMaxScaler() |
| 26 | 0.764644 | 0.108024 | 0.046847 | 6 | 50 | 1.000000 | OneHotEncoder() | StandardScaler() |
| 48 | 0.764265 | 0.010000 | 0.010000 | 4 | 1516 | 0.422431 | OneHotEncoder() | StandardScaler() |
| 29 | 0.763372 | 0.011565 | 0.091828 | 3 | 1250 | 0.188813 | OneHotEncoder() | StandardScaler() |
def _clean_name(name):
    """Make a column name safe for a statsmodels formula: spaces become
    underscores and any remaining non-alphanumeric characters are dropped."""
    return ''.join(ch for ch in name.replace(' ', '_') if ch == '_' or ch.isalnum())

# Map each original column name to its cleaned version.
cleaned_column_names = {col: _clean_name(col) for col in score_dataframe.columns.tolist()}
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'colsample_bytree': 'colsample_bytree',
'learning_rate': 'learning_rate',
'max_depth': 'max_depth',
'n_estimators': 'n_estimators',
'subsample': 'subsample',
'encoder': 'encoder',
'scaler': 'scaler'}
# Rename the columns so they are valid terms in a statsmodels formula.
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
# Regress the mean CV score on every hyper-parameter to see which ones matter.
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
# Joins the predictor names with " + " (see the printed formula below).
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data = score_dataframe)
# NOTE(review): this rebinds `results`, shadowing the MLExperimentResults
# object created earlier — from here on `results` is the fitted OLS model.
results = model.fit()
print(results.summary())
roc_auc_Mean ~ colsample_bytree + learning_rate + max_depth + n_estimators + subsample + encoder + scaler
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.362
Model: OLS Adj. R-squared: 0.238
Method: Least Squares F-statistic: 2.912
Date: Mon, 31 Jan 2022 Prob (F-statistic): 0.0114
Time: 08:07:08 Log-Likelihood: 121.22
No. Observations: 50 AIC: -224.4
Df Residuals: 41 BIC: -207.2
Df Model: 8
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
Intercept 0.7523 0.018 42.907 0.000 0.717 0.788
encoder[T.OneHotEncoder()] 0.0010 0.008 0.126 0.901 -0.016 0.018
scaler[T.None] -0.0019 0.012 -0.161 0.873 -0.026 0.022
scaler[T.StandardScaler()] -0.0126 0.008 -1.647 0.107 -0.028 0.003
colsample_bytree 0.0003 0.010 0.030 0.976 -0.020 0.021
learning_rate -0.1623 0.037 -4.393 0.000 -0.237 -0.088
max_depth -0.0004 0.002 -0.231 0.819 -0.003 0.003
n_estimators -1.966e-06 5e-06 -0.393 0.696 -1.21e-05 8.14e-06
subsample 0.0084 0.010 0.805 0.425 -0.013 0.029
==============================================================================
Omnibus: 46.267 Durbin-Watson: 0.756
Prob(Omnibus): 0.000 Jarque-Bera (JB): 234.076
Skew: -2.298 Prob(JB): 1.48e-51
Kurtosis: 12.552 Cond. No. 1.25e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.25e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

# NOTE(review): `scaler` is not used below; kept for parity with the original cell.
scaler = StandardScaler()
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)

# Standardize the numeric columns so the regression coefficients become
# directly comparable; pass the non-numeric columns through untouched.
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
score_dataframe_transformed = pd.DataFrame(
    transformations_pipeline.fit_transform(score_dataframe),
    columns=numeric_columns + non_numeric_columns,
)
score_dataframe_transformed.head()
['roc_auc_Mean', 'colsample_bytree', 'learning_rate', 'max_depth', 'n_estimators', 'subsample'] ['encoder', 'scaler']
| roc_auc_Mean | colsample_bytree | learning_rate | max_depth | n_estimators | subsample | encoder | scaler | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.28169 | -0.752182 | -0.707525 | 0.486278 | -0.853594 | 0.924709 | CustomOrdinalEncoder() | StandardScaler() |
| 1 | 1.17484 | 0.580773 | -0.718861 | 0.486278 | -0.977332 | -0.991659 | OneHotEncoder() | MinMaxScaler() |
| 2 | 1.096314 | -0.820983 | -0.346817 | -0.298042 | -1.130266 | 1.072477 | OneHotEncoder() | StandardScaler() |
| 3 | 1.082191 | -1.103926 | -0.727193 | -1.082361 | 0.907927 | -0.616895 | OneHotEncoder() | StandardScaler() |
| 4 | 1.048905 | -1.099408 | 0.117515 | -1.474521 | 0.538105 | -1.300221 | OneHotEncoder() | StandardScaler() |
# ColumnTransformer returns an object-dtype ndarray, so the scaled numeric
# columns come back as object columns; cast them back to float in one pass
# (replaces six repetitive per-column astype statements).
numeric_score_columns = ['roc_auc_Mean', 'colsample_bytree', 'learning_rate',
                         'max_depth', 'n_estimators', 'subsample']
score_dataframe_transformed[numeric_score_columns] = \
    score_dataframe_transformed[numeric_score_columns].astype('float')
# Refit the same OLS formula on the standardized data; only the coefficient
# scale (and the condition number) changes, not the fit statistics.
print(formula)
model = smf.ols(formula=formula,
                data = score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ colsample_bytree + learning_rate + max_depth + n_estimators + subsample + encoder + scaler
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.362
Model: OLS Adj. R-squared: 0.238
Method: Least Squares F-statistic: 2.912
Date: Mon, 31 Jan 2022 Prob (F-statistic): 0.0114
Time: 08:07:08 Log-Likelihood: -59.698
No. Observations: 50 AIC: 137.4
Df Residuals: 41 BIC: 154.6
Df Model: 8
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
Intercept 0.1994 0.293 0.680 0.500 -0.393 0.791
encoder[T.OneHotEncoder()] 0.0384 0.305 0.126 0.901 -0.578 0.655
scaler[T.None] -0.0720 0.449 -0.161 0.873 -0.978 0.834
scaler[T.StandardScaler()] -0.4695 0.285 -1.647 0.107 -1.045 0.106
colsample_bytree 0.0040 0.133 0.030 0.976 -0.264 0.272
learning_rate -0.5862 0.133 -4.393 0.000 -0.856 -0.317
max_depth -0.0334 0.145 -0.231 0.819 -0.326 0.259
n_estimators -0.0527 0.134 -0.393 0.696 -0.324 0.218
subsample 0.1071 0.133 0.805 0.425 -0.162 0.376
==============================================================================
Omnibus: 46.267 Durbin-Watson: 0.756
Prob(Omnibus): 0.000 Jarque-Bera (JB): 234.076
Skew: -2.298 Prob(JB): 1.48e-51
Kurtosis: 12.552 Cond. No. 5.03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Collect the standardized coefficients and p-values into a tidy dataframe.
# reset_index(drop=True) fixes the feature names appearing twice in the
# display (results.params' index carried over as the dataframe index).
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params,
    'p_value': results.pvalues,
})
coefficients = coefficients.query("feature != 'Intercept'").reset_index(drop=True)
# Flag coefficients that are statistically significant at the 5% level.
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | 0.038388 | 0.900558 | False |
| scaler[T.None] | scaler[T.None] | -0.071999 | 0.873255 | False |
| scaler[T.StandardScaler()] | scaler[T.StandardScaler()] | -0.469507 | 0.107154 | False |
| colsample_bytree | colsample_bytree | 0.003974 | 0.976248 | False |
| learning_rate | learning_rate | -0.586246 | 0.000077 | True |
| max_depth | max_depth | -0.033448 | 0.818702 | False |
| n_estimators | n_estimators | -0.052716 | 0.696414 | False |
| subsample | subsample | 0.107104 | 0.425320 | False |
# The name of the score column used in the plot title below.
score_variable
'roc_auc Mean'
# Horizontal bar chart of the standardized coefficients, ordered by absolute
# magnitude so the most influential hyper-parameters appear on top.
magnitude_order = coefficients['coefficient'].abs().sort_values(ascending=True).index
px.bar(
    data_frame=coefficients.reindex(magnitude_order),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600 * hlp.plot.GOLDEN_RATIO,
)
from sklearn.inspection import permutation_importance

# Permutation importance of the tuned pipeline on the training data,
# timed so long runs are visible.
estimator = bayes_search.best_estimator_
start_time = time.time()
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
forest_importances = pd.Series(result.importances_mean,
                               index=feature_names).sort_values(ascending=False)
Elapsed time to compute the importances: 4.324 seconds
import matplotlib.pyplot as plt

# Bar chart of permutation importances; the std across the 10 repeats is
# shown as error bars.
figure, axis = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=axis)
axis.set_ylabel("Mean accuracy decrease")
axis.set_title("Feature importances using permutation on full model")
figure.set_size_inches(9, 6)
figure.tight_layout()
plt.show()
# Average default rate by foreign-worker status (assign() copies X_train,
# so the training data is not mutated).
temp = X_train.assign(default=y_train)
temp.groupby('foreign_worker').agg({'default': np.mean})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Distribution of age split by default status.
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600 * hlp.plot.GOLDEN_RATIO,
)
fig.show()
NOTE: foreign_worker seems like it should be important (the default rate is 30.8% for foreign workers vs 10.7% for non-foreign workers, per the groupby above) but it is ranked last in permutation importance.